{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import gym\n",
    "import matplotlib\n",
    "import numpy as np\n",
    "import sys\n",
    "\n",
    "from collections import defaultdict\n",
    "if \"../\" not in sys.path:\n",
    "  sys.path.append(\"../\") \n",
    "from lib.envs.blackjack import BlackjackEnv\n",
    "from lib import plotting\n",
    "\n",
    "matplotlib.style.use('ggplot')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "env = BlackjackEnv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def create_random_policy(nA):\n",
    "    \"\"\"\n",
    "    Creates a random policy function.\n",
    "    \n",
    "    Args:\n",
    "        nA: Number of actions in the environment.\n",
    "    \n",
    "    Returns:\n",
    "        A function that takes an observation as input and returns a vector\n",
    "        of action probabilities\n",
    "    \"\"\"\n",
    "    A = np.ones(nA, dtype=float) / nA\n",
    "    def policy_fn(observation):\n",
    "        return A\n",
    "    return policy_fn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def create_greedy_policy(Q):\n",
    "    \"\"\"\n",
    "    Creates a greedy policy based on Q values.\n",
    "    \n",
    "    Args:\n",
    "        Q: A dictionary that maps from state -> action values\n",
    "        \n",
    "    Returns:\n",
    "        A function that takes an observation as input and returns a vector\n",
    "        of action probabilities.\n",
    "    \"\"\"\n",
    "    \n",
    "    def policy_fn(observation):\n",
    "        pass\n",
    "        # Implement this!\n",
    "    return policy_fn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def mc_control_importance_sampling(env, num_episodes, behavior_policy, discount_factor=1.0):\n",
    "    \"\"\"\n",
    "    Monte Carlo Control Off-Policy Control using Weighted Importance Sampling.\n",
    "    Finds an optimal greedy policy.\n",
    "    \n",
    "    Args:\n",
    "        env: OpenAI gym environment.\n",
    "        num_episodes: Number of episodes to sample.\n",
    "        behavior_policy: The behavior to follow while generating episodes.\n",
    "            A function that given an observation returns a vector of probabilities for each action.\n",
    "        discount_factor: Gamma discount factor.\n",
    "    \n",
    "    Returns:\n",
    "        A tuple (Q, policy).\n",
    "        Q is a dictionary mapping state -> action values.\n",
    "        policy is a function that takes an observation as an argument and returns\n",
    "        action probabilities. This is the optimal greedy policy.\n",
    "    \"\"\"\n",
    "    \n",
    "    # The final action-value function.\n",
    "    # A dictionary that maps state -> action values\n",
    "    Q = defaultdict(lambda: np.zeros(env.action_space.n))\n",
    "    \n",
    "    # Our greedily policy we want to learn\n",
    "    target_policy = create_greedy_policy(Q)\n",
    "    \n",
    "    # Implement this!\n",
    "        \n",
    "    return Q, target_policy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "random_policy = create_random_policy(env.action_space.n)\n",
    "Q, policy = mc_control_importance_sampling(env, num_episodes=500000, behavior_policy=random_policy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# For plotting: Create value function from action-value function\n",
    "# by picking the best action at each state\n",
    "V = defaultdict(float)\n",
    "for state, action_values in Q.items():\n",
    "    action_value = np.max(action_values)\n",
    "    V[state] = action_value\n",
    "plotting.plot_value_function(V, title=\"Optimal Value Function\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
